* Start from an empty session and switch off output paging.
clear all
set more off

* Root folder of the replication package; the raw text inputs are read from
* sub-folders of this path.
local path0 "C:\Dropbox\GeneticsProject (1)\REStat\round_accepted_replication_files\"

* All .dta files referenced by bare name are read from / written to Workfiles.
cd "`path0'Workfiles\"

*==================================================================
*	Build the working file with complete information on ethnic shares
*==================================================================
* Read the ethnic bridge text file; variable names come from its first row.
insheet using "`path0'Data\EthnicFractionalization\Ehtnic_bridge_final_Jan2010.txt", names clear

* Discard every variable whose name starts with "v"
* (presumably unnamed spreadsheet columns - confirm against the raw file).
drop v*

* Shrink storage types and write the result to the working directory.
compress
save ethnic_bridge_full, replace

*==================================================================
*	Country codes used with the DNA data file
*==================================================================
* Read the country-code list; variable names come from its first row.
insheet using "`path0'Data\CavalliSforza\Country_codes.txt", names clear

* Keep a single copy of rows that are identical on every variable.
duplicates drop

* Shrink storage types and write the lookup table to the working directory.
compress
save Country_codes_for_DNA_data, replace



*==================================================================
*  create the file with complete DNA bridge + blood information
*==================================================================
insheet using "`path0'Data\CavalliSforza\Ethnic_bridge_DNA_final_Jan2010.txt", names clear

* Discard variables named "v" followed by exactly one character.
drop v?

* Harmonise the spelling of state names.
* Names stored entirely in upper case are rewritten in mixed case ...
foreach nm in Ukraine Hungary Romania Greece Israel Latvia Slovakia Macedonia Spain {
	replace state = "`nm'" if state == upper("`nm'")
}
* ... and the remaining entries need an explicit old -> new mapping.
replace state = "United Kingdom"  if state == "UK"
replace state = "Lithuania"       if state == "Lithuanian"
replace state = "South African R" if state == "SOUTH AFRICA"

* Rows without a population code are removed.
drop if pop_code==.

save Ethnic_bridge_DNA_final_Jan2010.dta, replace

* Load the C-S DNA data and attach the ethnic bridge to it.
use "DNAdata.dta", clear

* Apply the same state-name harmonisation as in the bridge file, so that
* the join key "state" is spelled identically on both sides.
foreach nm in Ukraine Hungary Romania Greece Israel Latvia Slovakia Macedonia Spain {
	replace state = "`nm'" if state == upper("`nm'")
}
replace state = "United Kingdom"  if state == "UK"
replace state = "Lithuania"       if state == "Lithuanian"
replace state = "South African R" if state == "SOUTH AFRICA"

* Join on state, population and population code; show the match breakdown.
capture drop _merge
joinby state population pop_code using Ethnic_bridge_DNA_final_Jan2010.dta, unmatched(both)
tab _merge

* Retain only observations present in both files.
keep if _merge==3
drop _merge

* Aggregate DNA data for the same ethnic group within a country.
* Groups are defined by (state, allele, ethnic_bridge).
*   sample_size_wgt2  : group total of sample_size_wgt
*   temp_sample_size2 : each row's sample_size divided by that group total
*   allele_wgtA       : group total of sample_size_wgt2*freq, divided by sample_size_wgt2
*   allele_wgt_sdA    : square root of the group total of
*                       temp_sample_size2*(freq-allele_wgtA)^2
* NOTE(review): sample_size_wgt2 is constant within a group, so as written
* allele_wgtA reduces to the group sum of freq rather than a weighted mean of
* freq. Confirm whether sum(sample_size_wgt*freq) was intended.
egen sample_size_wgt2=sum(sample_size_wgt), by(state allele ethnic_bridge)
egen temp_alleleA=sum(sample_size_wgt2*freq), by(state allele ethnic_bridge)
gen temp_sample_size2=sample_size/sample_size_wgt2

gen allele_wgtA=temp_alleleA/sample_size_wgt2

gen temp_allele_sdA=temp_sample_size2*(freq-allele_wgtA)^2
egen temp_allele_sdA2=sum(temp_allele_sdA), by(state allele ethnic_bridge)
gen allele_wgt_sdA=(temp_allele_sdA2)^0.5

* Aggregate DNA data for the same ethnic group across countries.
* Same construction as above, but groups are defined by (allele, ethnic_bridge)
* only, i.e. pooled over states.
* NOTE(review): the same remark applies to allele_wgtB - sample_size_wgt3 is
* constant within a group, so allele_wgtB equals the group sum of freq.
egen sample_size_wgt3=sum(sample_size_wgt), by(allele ethnic_bridge)
egen temp_alleleB=sum(sample_size_wgt3*freq), by(allele ethnic_bridge)
gen temp_sample_size3=sample_size/sample_size_wgt3

gen allele_wgtB=temp_alleleB/sample_size_wgt3

gen temp_allele_sdB=temp_sample_size3*(freq-allele_wgtB)^2
egen temp_allele_sdB2=sum(temp_allele_sdB), by(allele ethnic_bridge)
gen allele_wgt_sdB=(temp_allele_sdB2)^0.5


* Remove all intermediate variables whose names start with "temp_".
drop temp_*

* Replace the incoming sample_size_wgt and allele_wgt_sd with the
* within-country aggregates computed above.
drop sample_size_wgt
rename sample_size_wgt2 sample_size_wgt
drop allele_wgt_sd
rename allele_wgt_sdA allele_wgt_sd

* The cross-country aggregates are kept under a "_glob" suffix.
rename sample_size_wgt3 sample_size_wgt_glob
rename allele_wgt_sdB allele_wgt_sd_glob

* Convert ethnic_bridge to numeric; with -force-, entries that cannot be
* read as numbers become missing.
destring ethnic_bridge, force replace

*** attach country codes by state; rows that exist only in the
*** country-code file are removed
capture drop _merge
joinby state using Country_codes_for_DNA_data, unmatched(both)
tab _merge
drop if _merge==2
capture drop _merge

* Snapshot used as the common starting point for both collapses below.
save temp000, replace

* Variables carried (first observation per group) into both collapsed files.
local popvars latitude longitude continent pop_description pop_code sample_size freq

* (1) One row per country code, allele and ethnic group.
collapse (first) `popvars' sample_size_wgt allele_wgtA allele_wgt_sd state, ///
	by(countrycode allele ethnic_bridge)
compress
save temp_DNA_bridge, replace

* (2) One row per allele and ethnic group, pooled over countries.
use temp000, clear
collapse (first) `popvars' sample_size_wgt_glob allele_wgtB allele_wgt_sd_glob, ///
	by(allele ethnic_bridge)
compress
save temp_DNA_bridge_global, replace

*==================================================================
*  			merge DNA and ethnic data
*==================================================================

*** create file with country codes (ISO 3-letter)
insheet using "`path0'Data\EthnicFractionalization\Country_codes_bridge.txt", names clear
capture drop v6

* Country names stored entirely in upper case are rewritten in mixed case.
foreach nm in Bolivia China Cuba Haiti India Honduras "Dominican Rep." Ecuador ///
		"El Salvador" Guatemala Guyana Jamaica Uruguay Venezuela Iceland Luxemburg {
	replace country = "`nm'" if country == upper("`nm'")
}

* Names whose new spelling is not simply the mixed-case form of the old one.
replace country = "Costa Rica"          if country == "COSTARICA"
replace country = "Trinidad and Tobago" if country == "TRINIDAD & TOBAGO"
replace country = "Serbia-Montenegro"   if country == "YUGOSLAVIA"

save ethnic_country_codes_bridge, replace

*------------------------------
*** merge ISO country codes with ethnic bridge
use ethnic_bridge_full, clear

* These countries are rewritten in upper case before the join on country.
foreach nm in Argentina Armenia Australia Brazil Canada Chile Colombia ///
		Mexico Namibia "New Zealand" Nicaragua Panama Paraguay Peru {
	replace country = upper("`nm'") if country == "`nm'"
}

* One row per country and ethnic group: total the group shares (gpro),
* keep the first group name, then remove rows without a bridge code.
rename bridge ethnic_bridge
collapse (sum) gpro (first) group, by(country ethnic_bridge)
drop if ethnic_bridge==.

* Attach the country codes, keeping unmatched rows from both sides.
joinby country using ethnic_country_codes_bridge, unmatched(both)

* Remove rows with an empty country name and the listed entities.
drop if country=="" | country=="CZECHOSLOVAKIA" | country=="PAPUA N.G." ///
	| country=="USSR" | country=="YUGOSLAV" | country=="YUGOSLAVIA"

* Match breakdown for the remaining rows.
tab _merge
drop _merge

*---------------------------------------------------
*** merge DNA and ethnic data
capture drop _merge
save temp001, replace

* Stage 1: match on country code and ethnic group against the
* country-level DNA file; every master row is kept.
joinby countrycode ethnic_bridge using temp_DNA_bridge, unmatched(master)
tab _merge
rename _merge _merge_state
save temp002, replace

* Stage 2: rows with no country-level match (_merge_state==1) are stripped
* of the empty DNA variables and re-matched on ethnic group alone against
* the file pooled over countries.
keep if _merge_state==1
drop allele latitude longitude continent pop_description pop_code ///
	sample_size freq sample_size_wgt allele_wgtA allele_wgt_sd state
save temp003, replace

joinby ethnic_bridge using temp_DNA_bridge_global, unmatched(master)
tab _merge
drop _merge
save temp004, replace

* Stage 3: stack the country-level matches on top of the stage-2 rows.
use temp002, clear
keep if _merge_state==3
append using temp004

* Where the country-level weight is missing, use the pooled weight.
replace sample_size_wgt=sample_size_wgt_glob if sample_size_wgt==.

drop state


*==================================================================
*  			compute frequency of blood types
*			at the country level
*==================================================================

* gpro1: total of gpro within country code and allele.
* gpro2: each row's gpro as a fraction of that total.
egen gpro1=sum(gpro), by(countrycode allele)
gen gpro2=gpro/gpro1

* Remove rows with an undefined share or without an allele.
drop if gpro2==. | allele==""

* Share-weighted allele frequency, and the same quantity scaled by the
* sample-size weight.
gen freqblood=gpro2*freq
gen eff_sample_size=gpro2*freq*sample_size_wgt

* One row per country code and allele: sum the weighted pieces, average the
* coordinates, and carry the first value of the country-level attributes.
collapse (sum) freqblood eff_sample_size ///
	(mean) latitude longitude ///
	(first) continent country  cont landlocked muslim oecd*, ///
	by(countrycode allele)

drop if countrycode==""
compress

* Go from one row per (country code, allele) to one row per country code;
* each reshaped variable receives the allele value as a suffix.
reshape wide  freqblood eff_sample_size latitude longitude continent country, i(countrycode) j(allele) string

label var countrycode "Country code"

* Per blood type: label, then shorten the names
* (freqbloodX becomes bloodX, eff_sample_sizeX becomes efs_X).
foreach t in A B O {
	label var freqblood`t' "Weighted frequency of blood type `t'"
	rename freqblood`t' blood`t'

	label var eff_sample_size`t' "Effective sample size in Cavalli-Sforza et al sample of studies: blood type `t'"
	rename eff_sample_size`t' efs_`t'
}

* For these variables only the copy suffixed A is kept, under the
* unsuffixed name; the B and O copies are dropped.
foreach v in latitude longitude continent country {
	drop `v'B `v'O
	rename `v'A `v'
}
label var latitude "latitude of country in Cavalli-Sforza et al sample of studies"
label var longitude "longitude of country in Cavalli-Sforza et al sample of studies"
label var continent "Continent"
label var country "country"

label var landlocked "Dummy for landlocked countries"
* Label text corrected: it previously read "muslin".
label var muslim "Share of muslim population"

order country countrycode blood* efs*

* The continent variable that came through the reshape is discarded and
* replaced by cont, which receives value labels.
capture drop continent
rename cont continent
label var continent "Continent: Europe, Asia, etc."
label define lblcontinent 1 "Asia" 2 "Europe" 3 "Africa" 4 "America" 5 "Oceania"
label values continent lblcontinent


*====================================================================
*		Compute blood distance relative to the UK (and US)
*		Use: C-S et al data
*====================================================================

* A: covariance matrix of bloodA and bloodB; B: its inverse, used as the
* weighting matrix of the Mahalanobis distance below.
correlate bloodA bloodB, cov
matrix A=r(C)
matrix B=inv(A)

* For each reference country create
*   distE_<ref> : Euclidean distance in (bloodA, bloodB)
*   distM_<ref> : Mahalanobis distance in (bloodA, bloodB), weighted by B
*   distA_<ref> : absolute difference in bloodA
*   distB_<ref> : absolute difference in bloodB
* If no row has country equal to the reference name, the reference values
* are missing and so are all four distances.
foreach var in UK USA {
	capture drop temp*

	* Reference values: non-missing only on the reference country's row ...
	gen temp_bloodA= bloodA if country=="`var'"
	gen temp_bloodB= bloodB if country=="`var'"

	* ... then copied to every row via the overall maximum.
	egen temp1_bloodA=max(temp_bloodA)
	egen temp1_bloodB=max(temp_bloodB)

	gen distE_`var'=sqrt( (bloodA-temp1_bloodA)^2 + (bloodB-temp1_bloodB)^2 )

	gen distM_`var'=sqrt(  (bloodA-temp1_bloodA)*(B[1,1]*(bloodA-temp1_bloodA) + B[2,1]*(bloodB-temp1_bloodB) ) ///
						 + (bloodB-temp1_bloodB)*(B[1,2]*(bloodA-temp1_bloodA) + B[2,2]*(bloodB-temp1_bloodB) ) )

	gen distA_`var'=abs(bloodA-temp1_bloodA)
	gen distB_`var'=abs(bloodB-temp1_bloodB)

	* Labels are attached here using the full variable names, so they do not
	* depend on variable-name abbreviation being enabled.
	label var distE_`var' "Euclidian distance from `var'"
	label var distM_`var' "Mahalanobis distance from `var'"
	label var distA_`var' "Linear distance from `var': blood type A"
	label var distB_`var' "Linear distance from `var': blood type B"

	capture drop temp*
}

compress

save DNA_blood_data, replace

